# Global knitr chunk options: suppress messages and warnings in the rendered
# output, evaluate every chunk, and disable caching. TRUE/FALSE are spelled
# out because the T/F shorthands are ordinary variables that can be reassigned.
knitr::opts_chunk$set(message = FALSE, warning = FALSE,
                      eval = TRUE, cache = FALSE)

Intro

For today’s workshop, we’re going to use R to go through a typical bioinformatics analysis workflow. We’re going to use common bioinformatics techniques to visualize data and make beautiful figures.

The data we will analyze is breast cancer RNA-Seq data from TCGA, a popular publicly-available database for cancer-related datasets. The goal of the analysis will be to identify genes that show significant changes in expression between normal and tumor tissues, followed by identifying the pathways they are associated with. After importing the data and performing some data pre-processing, we will carry out differential expression analysis and gene set enrichment analysis.

Main steps in today’s workshop:

  1. Import and pre-process RNA-Seq data
  2. Identify differentially-expressed genes between tumor and control samples
  3. Identify significantly-enriched pathways in the gene sets

Make sure to have the following packages installed for this workshop:

Working with Expression Set Objects

An expression set is a data object consisting of three entities: the expression matrix (exprs), the phenotype data (pData), and the feature data (fData).

We read in the RDS file included in this repo. It corresponds to a subset of samples from a gene expression dataset of breast cancer (BRCA) primary tissue samples from the TCGA project.

library(Biobase)
library(magrittr)
library(dplyr)
library(ggplot2)
library(Biobase)
library(ggfortify)
library(plotly)
# Load the serialized expression set that ships with this repo
brca <- readRDS("data/TCGA-BRCA.rds")

# size of the expression set (features x samples)
brca %>% dim()
## Features  Samples 
##    36812     1222
# size of the feature (gene) annotation table
brca %>% fData() %>% dim()
## [1] 36812     4
# peek at the identifier columns of the gene annotation
brca %>%
  fData() %>%
  .[, c("ensembl_transcript_id", "ensembl_gene_id", "hgnc_symbol")] %>%
  head()
##          ensembl_transcript_id ensembl_gene_id hgnc_symbol
## TSPAN6      ENSG00000000003.13 ENSG00000000003      TSPAN6
## TNMD         ENSG00000000005.5 ENSG00000000005        TNMD
## DPM1        ENSG00000000419.11 ENSG00000000419        DPM1
## SCYL3       ENSG00000000457.12 ENSG00000000457       SCYL3
## C1orf112    ENSG00000000460.15 ENSG00000000460    C1orf112
## FGR         ENSG00000000938.11 ENSG00000000938         FGR
# size of the phenotype (sample) annotation table
brca %>% pData() %>% dim()
## [1] 1222   65
# peek at a few phenotype columns
brca %>%
  pData() %>%
  .[, c("patient_id", "sample_type", "tumor_subtype")] %>%
  head()
##                                patient_id   sample_type tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A8-A085 Primary Tumor          LumB
## TCGA-A2-A0SY-01A-31R-A084-07 TCGA-A2-A0SY Primary Tumor          LumA
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-AR-A24Z Primary Tumor          LumB
## TCGA-D8-A1XU-01A-11R-A14M-07 TCGA-D8-A1XU Primary Tumor          LumA
## TCGA-A1-A0SN-01A-11R-A144-07 TCGA-A1-A0SN Primary Tumor          Her2
## TCGA-D8-A73W-01A-22R-A352-07 TCGA-D8-A73W Primary Tumor          LumB
# sample counts per sample type
# (table() is called directly rather than piped so the printed header stays
# unnamed)
table(brca$sample_type)
## 
##          Metastatic       Primary Tumor Solid Tissue Normal 
##                   7                1102                 113
# sample counts per tumor subtype
table(brca$tumor_subtype)
## 
##  Basal   Her2   LumA   LumB Normal 
##    169    209    510    198     16

Log transform the data set

# log2-transform the expression matrix in place; the pseudocount of 1 maps a
# value of 0 to 0 instead of -Inf
exprs(brca) <- log2(1 + exprs(brca))
# show the top-left 5 x 5 corner of the transformed matrix
exprs(brca)[seq_len(5), seq_len(5)]
##          TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A2-A0SY-01A-31R-A084-07
## TSPAN6                      13.975579                    10.981567
## TNMD                         1.584963                     6.189825
## DPM1                        11.156715                    10.822571
## SCYL3                       10.590587                    10.946906
## C1orf112                     9.519636                     9.339850
##          TCGA-AR-A24Z-01A-11R-A169-07 TCGA-D8-A1XU-01A-11R-A14M-07
## TSPAN6                      12.302353                    12.463013
## TNMD                         4.459432                     2.807355
## DPM1                        11.945444                    12.266494
## SCYL3                       10.611025                    11.149747
## C1orf112                     9.388017                     9.400879
##          TCGA-A1-A0SN-01A-11R-A144-07
## TSPAN6                       9.714246
## TNMD                         1.000000
## DPM1                        12.419960
## SCYL3                       11.136350
## C1orf112                     9.884171

PCA

Start by ranking genes based on their variation across samples

# Per-gene variance across all samples (one value per matrix row),
# ordered from most to least variable
row.var <- exprs(brca) %>%
  apply(1, var) %>%
  sort(decreasing = TRUE)
head(row.var)
##   CLEC3A  SCGB2A2     CPB1     TFF1  SCGB1D2    KCNJ3 
## 29.73892 25.49291 24.59669 21.00591 20.25785 19.56774

To save time, we’ll run PCA on the top 2500 most variable genes

# Names of the 2500 most variable genes. head() is used instead of [1:2500]
# so that no NA names are produced if fewer than 2500 genes are available.
top.genes <- head(names(row.var), 2500)

# Subset the expression set by feature (explicit row index, matching the
# `brca[genes, ]` form used later), then transpose so that samples are rows
# and genes are columns, which is the layout prcomp() works on.
df <- brca[top.genes, ] %>%
  exprs() %>%
  t() %>%
  data.frame()

# Run PCA and report the variance explained by the first five components
pca <- prcomp(df)
pca.summary <- summary(pca)
pca.summary$importance[, 1:5]
##                             PC1      PC2      PC3      PC4      PC5
## Standard deviation     47.42767 40.65232 29.32491 23.58570 20.28785
## Proportion of Variance  0.14934  0.10972  0.05709  0.03693  0.02733
## Cumulative Proportion   0.14934  0.25906  0.31615  0.35309  0.38041

2-D Plot

# Attach the subtype label of every sample so autoplot() can colour by it
df[["tumor_subtype"]] <- pData(brca)$tumor_subtype
# Scatter of the first two principal components, coloured by tumor subtype
autoplot(pca, data = df, colour = "tumor_subtype")

3-D Plot

# Combine the first three principal components with the subtype labels.
# The data frame is built directly with data.frame(): cbind()-ing the numeric
# score matrix with a character vector would coerce every PC score to a
# character string, and plotly would then draw categorical instead of
# continuous axes. Column names PC1..PC3 come from the score matrix itself.
df.pca <- data.frame(pca$x[, 1:3],
                     tumor_subtype = brca$tumor_subtype,
                     stringsAsFactors = FALSE)

head(df.pca)
##                                     PC1        PC2        PC3 tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07 -89.989559  14.472815 -38.419009          LumB
## TCGA-A2-A0SY-01A-31R-A084-07   8.620137 -39.532596   7.548697          LumA
## TCGA-AR-A24Z-01A-11R-A169-07 -41.881673 -26.630439  11.325290          LumB
## TCGA-D8-A1XU-01A-11R-A14M-07 -22.800403 -38.303344  24.311176          LumA
## TCGA-A1-A0SN-01A-11R-A144-07 -26.310880   1.885792  21.699496          Her2
## TCGA-D8-A73W-01A-22R-A352-07 -46.726299   6.829956 -12.915148          LumB
# Interactive 3-D scatter of PC1-PC3, one marker per sample,
# coloured by tumor subtype
p <- plot_ly(df.pca,
             x = ~PC1,
             y = ~PC2,
             z = ~PC3,
             type = "scatter3d",
             mode = "markers",
             color = ~tumor_subtype,
             marker = list(size = 3))

p

Challenge: Do it in 4-D

Data Wrangling Questions

Example Dataset

# Gene panel used for the data-wrangling example
genes <- c("FOXA1", "MLPH", "AR", "GATA3", "DNALI1", "FAM47E", "RHOB", "SPDEF",
           "SLC7A8", "TTC6", "CA12", "SMIM14", "C5AR2", "SIDT1", "NOSTRIN",
           "CCDC125", "FAM198B-AS1", "TBC1D9", "SLC44A4", "DYNLRB2", "THSD4",
           "FAM214A", "GTF2IP7", "SLC22A5", "CCDC170")

# PCA on the panel only: subset the features, transpose so samples are rows,
# run prcomp() and keep the matrix of per-sample scores
pcs <- brca[genes, ] %>%
  exprs() %>%
  t() %>%
  data.frame() %>%
  prcomp() %>%
  .[["x"]]

# Put the PC1 score next to the full phenotype table. drop = FALSE keeps PC1
# as a one-column matrix so the column stays named "PC1"; FALSE is spelled out
# because the F shorthand is an ordinary variable that can be reassigned.
df <- cbind(pcs[, "PC1", drop = FALSE], pData(brca))

head(df)
##                                    PC1                      full_id
## TCGA-A8-A085-01A-11R-A00Z-07 -7.550408 TCGA-A8-A085-01A-11R-A00Z-07
## TCGA-A2-A0SY-01A-31R-A084-07 -6.881499 TCGA-A2-A0SY-01A-31R-A084-07
## TCGA-AR-A24Z-01A-11R-A169-07 -9.659289 TCGA-AR-A24Z-01A-11R-A169-07
## TCGA-D8-A1XU-01A-11R-A14M-07 -7.164274 TCGA-D8-A1XU-01A-11R-A14M-07
## TCGA-A1-A0SN-01A-11R-A144-07 -3.429342 TCGA-A1-A0SN-01A-11R-A144-07
## TCGA-D8-A73W-01A-22R-A352-07 -5.565336 TCGA-D8-A73W-01A-22R-A352-07
##                                patient_id        sample_id
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A8-A085 TCGA-A8-A085-01A
## TCGA-A2-A0SY-01A-31R-A084-07 TCGA-A2-A0SY TCGA-A2-A0SY-01A
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-AR-A24Z TCGA-AR-A24Z-01A
## TCGA-D8-A1XU-01A-11R-A14M-07 TCGA-D8-A1XU TCGA-D8-A1XU-01A
## TCGA-A1-A0SN-01A-11R-A144-07 TCGA-A1-A0SN TCGA-A1-A0SN-01A
## TCGA-D8-A73W-01A-22R-A352-07 TCGA-D8-A73W TCGA-D8-A73W-01A
##                                                           case_id
## TCGA-A8-A085-01A-11R-A00Z-07 3c08aabd-d5b5-4bbe-857c-38a7527b2163
## TCGA-A2-A0SY-01A-31R-A084-07 dc696e3c-f448-468f-a576-f4429be0338a
## TCGA-AR-A24Z-01A-11R-A169-07 9fefbe7c-f66a-4940-843e-285cb7b392c1
## TCGA-D8-A1XU-01A-11R-A14M-07 332148f5-f070-4c20-8eb1-4d8c0673aa52
## TCGA-A1-A0SN-01A-11R-A144-07 0dc337fa-da8b-42c4-b9a7-fb76d81c161f
## TCGA-D8-A73W-01A-22R-A352-07 ea8dbc7a-54c6-469c-865e-f49d00b0223d
##                              submitter_id project_id gender year_of_birth
## TCGA-A8-A085-01A-11R-A00Z-07 TCGA-A8-A085  TCGA-BRCA   male          1964
## TCGA-A2-A0SY-01A-31R-A084-07 TCGA-A2-A0SY  TCGA-BRCA female          1945
## TCGA-AR-A24Z-01A-11R-A169-07 TCGA-AR-A24Z  TCGA-BRCA female          1949
## TCGA-D8-A1XU-01A-11R-A14M-07 TCGA-D8-A1XU  TCGA-BRCA female          1954
## TCGA-A1-A0SN-01A-11R-A144-07 TCGA-A1-A0SN  TCGA-BRCA female          1957
## TCGA-D8-A73W-01A-22R-A352-07 TCGA-D8-A73W  TCGA-BRCA female          1934
##                                      race              ethnicity
## TCGA-A8-A085-01A-11R-A00Z-07 not reported           not reported
## TCGA-A2-A0SY-01A-31R-A084-07        white not hispanic or latino
## TCGA-AR-A24Z-01A-11R-A169-07        white           not reported
## TCGA-D8-A1XU-01A-11R-A14M-07        white not hispanic or latino
## TCGA-A1-A0SN-01A-11R-A144-07        white not hispanic or latino
## TCGA-D8-A73W-01A-22R-A352-07        white not hispanic or latino
##                              year_of_death classification_of_tumor
## TCGA-A8-A085-01A-11R-A00Z-07            --            not reported
## TCGA-A2-A0SY-01A-31R-A084-07            --            not reported
## TCGA-AR-A24Z-01A-11R-A169-07            --            not reported
## TCGA-D8-A1XU-01A-11R-A14M-07            --            not reported
## TCGA-A1-A0SN-01A-11R-A144-07            --            not reported
## TCGA-D8-A73W-01A-22R-A352-07            --            not reported
##                              last_known_disease_status
## TCGA-A8-A085-01A-11R-A00Z-07              not reported
## TCGA-A2-A0SY-01A-31R-A084-07              not reported
## TCGA-AR-A24Z-01A-11R-A169-07              not reported
## TCGA-D8-A1XU-01A-11R-A14M-07              not reported
## TCGA-A1-A0SN-01A-11R-A144-07              not reported
## TCGA-D8-A73W-01A-22R-A352-07              not reported
##                                             primary_diagnosis tumor_stage
## TCGA-A8-A085-01A-11R-A00Z-07 Infiltrating duct carcinoma, NOS   stage iib
## TCGA-A2-A0SY-01A-31R-A084-07           Lobular carcinoma, NOS  stage iiia
## TCGA-AR-A24Z-01A-11R-A169-07 Infiltrating duct carcinoma, NOS   stage iia
## TCGA-D8-A1XU-01A-11R-A14M-07 Infiltrating duct carcinoma, NOS    stage ia
## TCGA-A1-A0SN-01A-11R-A144-07 Infiltrating duct carcinoma, NOS   stage iia
## TCGA-D8-A73W-01A-22R-A352-07          Mucinous adenocarcinoma  stage iiia
##                              age_at_diagnosis vital_status morphology
## TCGA-A8-A085-01A-11R-A00Z-07            16377        alive     8500/3
## TCGA-A2-A0SY-01A-31R-A084-07            22928        alive     8520/3
## TCGA-AR-A24Z-01A-11R-A169-07            20900        alive     8500/3
## TCGA-D8-A1XU-01A-11R-A14M-07            20715        alive     8500/3
## TCGA-A1-A0SN-01A-11R-A144-07            18401        alive     8500/3
## TCGA-D8-A73W-01A-22R-A352-07            29125         dead     8480/3
##                              days_to_death
## TCGA-A8-A085-01A-11R-A00Z-07            --
## TCGA-A2-A0SY-01A-31R-A084-07            --
## TCGA-AR-A24Z-01A-11R-A169-07            --
## TCGA-D8-A1XU-01A-11R-A14M-07            --
## TCGA-A1-A0SN-01A-11R-A144-07            --
## TCGA-D8-A73W-01A-22R-A352-07         385.0
##                              days_to_last_known_disease_status
## TCGA-A8-A085-01A-11R-A00Z-07                                --
## TCGA-A2-A0SY-01A-31R-A084-07                                --
## TCGA-AR-A24Z-01A-11R-A169-07                                --
## TCGA-D8-A1XU-01A-11R-A14M-07                                --
## TCGA-A1-A0SN-01A-11R-A144-07                                --
## TCGA-D8-A73W-01A-22R-A352-07                                --
##                              days_to_recurrence  tumor_grade
## TCGA-A8-A085-01A-11R-A00Z-07                 -- not reported
## TCGA-A2-A0SY-01A-31R-A084-07                 -- not reported
## TCGA-AR-A24Z-01A-11R-A169-07                 -- not reported
## TCGA-D8-A1XU-01A-11R-A14M-07                 -- not reported
## TCGA-A1-A0SN-01A-11R-A144-07                 -- not reported
## TCGA-D8-A73W-01A-22R-A352-07                 -- not reported
##                              tissue_or_organ_of_origin days_to_birth
## TCGA-A8-A085-01A-11R-A00Z-07               Breast, NOS      -16377.0
## TCGA-A2-A0SY-01A-31R-A084-07               Breast, NOS      -22928.0
## TCGA-AR-A24Z-01A-11R-A169-07               Breast, NOS      -20900.0
## TCGA-D8-A1XU-01A-11R-A14M-07               Breast, NOS      -20715.0
## TCGA-A1-A0SN-01A-11R-A144-07               Breast, NOS      -18401.0
## TCGA-D8-A73W-01A-22R-A352-07               Breast, NOS      -29125.0
##                              progression_or_recurrence prior_malignancy
## TCGA-A8-A085-01A-11R-A00Z-07              not reported     not reported
## TCGA-A2-A0SY-01A-31R-A084-07              not reported     not reported
## TCGA-AR-A24Z-01A-11R-A169-07              not reported     not reported
## TCGA-D8-A1XU-01A-11R-A14M-07              not reported     not reported
## TCGA-A1-A0SN-01A-11R-A144-07              not reported     not reported
## TCGA-D8-A73W-01A-22R-A352-07              not reported     not reported
##                              site_of_resection_or_biopsy
## TCGA-A8-A085-01A-11R-A00Z-07                 Breast, NOS
## TCGA-A2-A0SY-01A-31R-A084-07                 Breast, NOS
## TCGA-AR-A24Z-01A-11R-A169-07                 Breast, NOS
## TCGA-D8-A1XU-01A-11R-A14M-07                 Breast, NOS
## TCGA-A1-A0SN-01A-11R-A144-07                 Breast, NOS
## TCGA-D8-A73W-01A-22R-A352-07                 Breast, NOS
##                              days_to_last_follow_up therapeutic_agents
## TCGA-A8-A085-01A-11R-A00Z-07                 1124.0                 --
## TCGA-A2-A0SY-01A-31R-A084-07                 1347.0                 --
## TCGA-AR-A24Z-01A-11R-A169-07                 3001.0                 --
## TCGA-D8-A1XU-01A-11R-A14M-07                  395.0                 --
## TCGA-A1-A0SN-01A-11R-A144-07                 1196.0                 --
## TCGA-D8-A73W-01A-22R-A352-07                  244.0                 --
##                              treatment_intent_type treatment_or_therapy
## TCGA-A8-A085-01A-11R-A00Z-07                    --                   --
## TCGA-A2-A0SY-01A-31R-A084-07                    --                   --
## TCGA-AR-A24Z-01A-11R-A169-07                    --                   --
## TCGA-D8-A1XU-01A-11R-A14M-07                    --                   --
## TCGA-A1-A0SN-01A-11R-A144-07                    --                   --
## TCGA-D8-A73W-01A-22R-A352-07                    --                   --
##                              sample_submitter_id case_submitter_id
## TCGA-A8-A085-01A-11R-A00Z-07    TCGA-A8-A085-01A      TCGA-A8-A085
## TCGA-A2-A0SY-01A-31R-A084-07    TCGA-A2-A0SY-01A      TCGA-A2-A0SY
## TCGA-AR-A24Z-01A-11R-A169-07    TCGA-AR-A24Z-01A      TCGA-AR-A24Z
## TCGA-D8-A1XU-01A-11R-A14M-07    TCGA-D8-A1XU-01A      TCGA-D8-A1XU
## TCGA-A1-A0SN-01A-11R-A144-07    TCGA-A1-A0SN-01A      TCGA-A1-A0SN
## TCGA-D8-A73W-01A-22R-A352-07    TCGA-D8-A73W-01A      TCGA-D8-A73W
##                              sample_type_id
## TCGA-A8-A085-01A-11R-A00Z-07              1
## TCGA-A2-A0SY-01A-31R-A084-07              1
## TCGA-AR-A24Z-01A-11R-A169-07              1
## TCGA-D8-A1XU-01A-11R-A14M-07              1
## TCGA-A1-A0SN-01A-11R-A144-07              1
## TCGA-D8-A73W-01A-22R-A352-07              1
##                              time_between_excision_and_freezing
## TCGA-A8-A085-01A-11R-A00Z-07                                 --
## TCGA-A2-A0SY-01A-31R-A084-07                                 --
## TCGA-AR-A24Z-01A-11R-A169-07                                 --
## TCGA-D8-A1XU-01A-11R-A14M-07                                 --
## TCGA-A1-A0SN-01A-11R-A144-07                                 --
## TCGA-D8-A73W-01A-22R-A352-07                                 --
##                              oct_embedded tumor_code_id
## TCGA-A8-A085-01A-11R-A00Z-07        false            --
## TCGA-A2-A0SY-01A-31R-A084-07         true            --
## TCGA-AR-A24Z-01A-11R-A169-07         true            --
## TCGA-D8-A1XU-01A-11R-A14M-07        false            --
## TCGA-A1-A0SN-01A-11R-A144-07         true            --
## TCGA-D8-A73W-01A-22R-A352-07        false            --
##                              intermediate_dimension is_ffpe
## TCGA-A8-A085-01A-11R-A00Z-07                     --   False
## TCGA-A2-A0SY-01A-31R-A084-07                     --   False
## TCGA-AR-A24Z-01A-11R-A169-07                     --   False
## TCGA-D8-A1XU-01A-11R-A14M-07                     --   False
## TCGA-A1-A0SN-01A-11R-A144-07                     --   False
## TCGA-D8-A73W-01A-22R-A352-07                     --   False
##                                             pathology_report_uuid
## TCGA-A8-A085-01A-11R-A00Z-07 64F84FF4-A477-4E1E-B4BB-E5614517229E
## TCGA-A2-A0SY-01A-31R-A084-07 8E6902A6-A673-46CC-9AEB-3A71EF11099F
## TCGA-AR-A24Z-01A-11R-A169-07 AD07F611-0EEA-4890-A02C-6DA3F5F57C45
## TCGA-D8-A1XU-01A-11R-A14M-07 845F8FCF-CF3C-4CEF-B673-A57DE626939C
## TCGA-A1-A0SN-01A-11R-A144-07 D0269758-EFAE-4EBA-8CCF-4A6CF4D4B35A
## TCGA-D8-A73W-01A-22R-A352-07 359DB5F2-BD23-42E1-B316-9D908DBACD78
##                              tumor_descriptor   sample_type
## TCGA-A8-A085-01A-11R-A00Z-07               -- Primary Tumor
## TCGA-A2-A0SY-01A-31R-A084-07               -- Primary Tumor
## TCGA-AR-A24Z-01A-11R-A169-07               -- Primary Tumor
## TCGA-D8-A1XU-01A-11R-A14M-07               -- Primary Tumor
## TCGA-A1-A0SN-01A-11R-A144-07               -- Primary Tumor
## TCGA-D8-A73W-01A-22R-A352-07               -- Primary Tumor
##                              distance_normal_to_tumor
## TCGA-A8-A085-01A-11R-A00Z-07                 released
## TCGA-A2-A0SY-01A-31R-A084-07                 released
## TCGA-AR-A24Z-01A-11R-A169-07                 released
## TCGA-D8-A1XU-01A-11R-A14M-07                 released
## TCGA-A1-A0SN-01A-11R-A144-07                 released
## TCGA-D8-A73W-01A-22R-A352-07                 released
##                              biospecimen_anatomic_site state
## TCGA-A8-A085-01A-11R-A00Z-07                        --    --
## TCGA-A2-A0SY-01A-31R-A084-07                        --    --
## TCGA-AR-A24Z-01A-11R-A169-07                        --    --
## TCGA-D8-A1XU-01A-11R-A14M-07                        --    --
## TCGA-A1-A0SN-01A-11R-A144-07                        --    --
## TCGA-D8-A73W-01A-22R-A352-07                        --    --
##                              diagnosis_pathologically_confirmed
## TCGA-A8-A085-01A-11R-A00Z-07                                 --
## TCGA-A2-A0SY-01A-31R-A084-07                                 --
## TCGA-AR-A24Z-01A-11R-A169-07                                 --
## TCGA-D8-A1XU-01A-11R-A14M-07                                 --
## TCGA-A1-A0SN-01A-11R-A144-07                                 --
## TCGA-D8-A73W-01A-22R-A352-07                                 --
##                              current_weight composition
## TCGA-A8-A085-01A-11R-A00Z-07             --          --
## TCGA-A2-A0SY-01A-31R-A084-07             --          --
## TCGA-AR-A24Z-01A-11R-A169-07             --          --
## TCGA-D8-A1XU-01A-11R-A14M-07             --          --
## TCGA-A1-A0SN-01A-11R-A144-07             --          --
## TCGA-D8-A73W-01A-22R-A352-07             --          --
##                              time_between_clamping_and_freezing
## TCGA-A8-A085-01A-11R-A00Z-07                                 --
## TCGA-A2-A0SY-01A-31R-A084-07                                 --
## TCGA-AR-A24Z-01A-11R-A169-07                                 --
## TCGA-D8-A1XU-01A-11R-A14M-07                                 --
## TCGA-A1-A0SN-01A-11R-A144-07                                 --
## TCGA-D8-A73W-01A-22R-A352-07                                 --
##                              distributor_reference shortest_dimension
## TCGA-A8-A085-01A-11R-A00Z-07                    --                 --
## TCGA-A2-A0SY-01A-31R-A084-07                    --                 --
## TCGA-AR-A24Z-01A-11R-A169-07                    --                 --
## TCGA-D8-A1XU-01A-11R-A14M-07                    --                 --
## TCGA-A1-A0SN-01A-11R-A144-07                    --                 --
## TCGA-D8-A73W-01A-22R-A352-07                    --                 --
##                              method_of_sample_procurement tumor_code
## TCGA-A8-A085-01A-11R-A00Z-07                           --        788
## TCGA-A2-A0SY-01A-31R-A084-07                           --       1083
## TCGA-AR-A24Z-01A-11R-A169-07                           --       1673
## TCGA-D8-A1XU-01A-11R-A14M-07                           --        102
## TCGA-A1-A0SN-01A-11R-A144-07                           --       1091
## TCGA-D8-A73W-01A-22R-A352-07                           --        191
##                              passage_count tissue_type
## TCGA-A8-A085-01A-11R-A00Z-07         130.0          --
## TCGA-A2-A0SY-01A-31R-A084-07         510.0          --
## TCGA-AR-A24Z-01A-11R-A169-07         120.0          --
## TCGA-D8-A1XU-01A-11R-A14M-07         210.0          --
## TCGA-A1-A0SN-01A-11R-A144-07         120.0          --
## TCGA-D8-A73W-01A-22R-A352-07         230.0          --
##                              biospecimen_laterality
## TCGA-A8-A085-01A-11R-A00Z-07                     --
## TCGA-A2-A0SY-01A-31R-A084-07                     --
## TCGA-AR-A24Z-01A-11R-A169-07                     --
## TCGA-D8-A1XU-01A-11R-A14M-07                     --
## TCGA-A1-A0SN-01A-11R-A144-07                     --
## TCGA-D8-A73W-01A-22R-A352-07                     --
##                              days_to_sample_procurement freezing_method
## TCGA-A8-A085-01A-11R-A00Z-07                         --              --
## TCGA-A2-A0SY-01A-31R-A084-07                         --              --
## TCGA-AR-A24Z-01A-11R-A169-07                         --              --
## TCGA-D8-A1XU-01A-11R-A14M-07                         --              --
## TCGA-A1-A0SN-01A-11R-A144-07                         --              --
## TCGA-D8-A73W-01A-22R-A352-07                         --              --
##                              preservation_method growth_rate
## TCGA-A8-A085-01A-11R-A00Z-07                  --          --
## TCGA-A2-A0SY-01A-31R-A084-07                  --          --
## TCGA-AR-A24Z-01A-11R-A169-07                  --          --
## TCGA-D8-A1XU-01A-11R-A14M-07                  --          --
## TCGA-A1-A0SN-01A-11R-A144-07                  --          --
## TCGA-D8-A73W-01A-22R-A352-07                  --          --
##                              days_to_collection catalog_reference
## TCGA-A8-A085-01A-11R-A00Z-07                 --                --
## TCGA-A2-A0SY-01A-31R-A084-07                 --                --
## TCGA-AR-A24Z-01A-11R-A169-07                 --                --
## TCGA-D8-A1XU-01A-11R-A14M-07                 --                --
## TCGA-A1-A0SN-01A-11R-A144-07                 --                --
## TCGA-D8-A73W-01A-22R-A352-07                 --                --
##                              initial_weight longest_dimension
## TCGA-A8-A085-01A-11R-A00Z-07             --                --
## TCGA-A2-A0SY-01A-31R-A084-07             --                --
## TCGA-AR-A24Z-01A-11R-A169-07             --                --
## TCGA-D8-A1XU-01A-11R-A14M-07             --                --
## TCGA-A1-A0SN-01A-11R-A144-07             --                --
## TCGA-D8-A73W-01A-22R-A352-07             --                --
##                              tumor_subtype
## TCGA-A8-A085-01A-11R-A00Z-07          LumB
## TCGA-A2-A0SY-01A-31R-A084-07          LumA
## TCGA-AR-A24Z-01A-11R-A169-07          LumB
## TCGA-D8-A1XU-01A-11R-A14M-07          LumA
## TCGA-A1-A0SN-01A-11R-A144-07          Her2
## TCGA-D8-A73W-01A-22R-A352-07          LumB

Example

# Mean PC1 score per tumor subtype; samples without a subtype label are
# dropped first. TRUE is spelled out because the T shorthand is an ordinary
# variable that can be reassigned.
ex1 <- df %>%
  filter(!is.na(tumor_subtype)) %>%
  group_by(tumor_subtype) %>%
  summarize(PC1_mean = mean(PC1, na.rm = TRUE))

print(ex1)
## # A tibble: 5 x 2
##   tumor_subtype PC1_mean
##   <chr>            <dbl>
## 1 Basal           16.8  
## 2 Her2            -0.413
## 3 LumA            -4.57 
## 4 LumB            -4.23 
## 5 Normal          10.4
# One large point per subtype, with shape and colour both mapped to the subtype
ggplot(data = ex1, aes(x = tumor_subtype, y = PC1_mean)) +
  geom_point(aes(shape = tumor_subtype, color = tumor_subtype), size = 10) +
  ylab("PC1  ~ Basal Genes") +
  xlab("Tumor Subtype") +
  labs(title = "Important Genes",
       subtitle = "Average PC1 Across Tumor Subtypes",
       caption = "Figure 1: Example of data wrangling into ggplot")

Q1

Make a boxplot of PC1 across tumor subtypes.

Q2

Make a scatter plot of PC1 across tumor subtypes.

Q3

Make a boxplot of PC1 across tumor stages within each gender.

Q4

Make a density plot of PC1 across tumor stages within females.

Q5

Make a correlation plot of PC1 with age.